#Setup
First we’ll load some packages, including modelr for easy modeling, and set some default plotting options.
# Core data-manipulation and plotting packages (dplyr, ggplot2, readr, ...).
library(tidyverse)
## -- Attaching packages -------------------------------------------------------------- tidyverse 1.3.0 --
## v ggplot2 3.3.1 v purrr 0.3.4
## v tibble 3.0.1 v dplyr 0.8.5
## v tidyr 1.1.0 v stringr 1.4.0
## v readr 1.3.1 v forcats 0.5.0
## -- Conflicts ----------------------------------------------------------------- tidyverse_conflicts() --
## x dplyr::filter() masks stats::filter()
## x dplyr::lag() masks stats::lag()
# Axis/label formatting helpers for ggplot2.
library(scales)
##
## Attaching package: 'scales'
## The following object is masked from 'package:purrr':
##
## discard
## The following object is masked from 'package:readr':
##
## col_factor
# modelr: rmse(), rsquare(), add_predictions() used below.
library(modelr)
# lubridate: date handling (ymd column parsing and weekday extraction).
library(lubridate)
##
## Attaching package: 'lubridate'
## The following objects are masked from 'package:dplyr':
##
## intersect, setdiff, union
## The following objects are masked from 'package:base':
##
## date, intersect, setdiff, union
# plotly: ggplotly() wrappers for interactive versions of the plots.
library(plotly)
##
## Attaching package: 'plotly'
## The following object is masked from 'package:ggplot2':
##
## last_plot
## The following object is masked from 'package:stats':
##
## filter
## The following object is masked from 'package:graphics':
##
## layout
# Use a plain black-and-white theme for all plots in this document.
theme_set(theme_bw())
# Default notebook plot size (inches); repr options apply in Jupyter-style frontends.
options(repr.plot.width=4, repr.plot.height=3)
Then we’ll load a data frame of the number of total trips taken by Citibike riders for each day in 2014, along with the weather on each day.
# One row per day of 2014: trip count plus that day's weather
# (precipitation, snow depth, snowfall, max/min temperature).
trips_per_day <- read_tsv('trips_per_day.tsv')
## Parsed with column specification:
## cols(
## ymd = col_date(format = ""),
## num_trips = col_double(),
## date = col_double(),
## prcp = col_double(),
## snwd = col_double(),
## snow = col_double(),
## tmax = col_double(),
## tmin = col_double()
## )
# Holiday dates: keep only the date and tag each with holiday = 1
# so the later left join leaves NA on non-holidays.
holidays <- read_csv("holidays.csv", col_names = c('sr','ymd','name')) %>% select(ymd) %>% mutate(holiday=1)
## Parsed with column specification:
## cols(
## sr = col_double(),
## ymd = col_date(format = ""),
## name = col_character()
## )
# Sunrise/sunset data was considered but is not used — TODO: remove or restore.
#sun <- read_csv("sun.csv") %>% as.Date(ymd, format = "%m/%d/%Y")
# Join holidays onto the daily data; NA after the join means "not a holiday", so recode to 0/1.
trips_per_day <- left_join(trips_per_day, holidays, by="ymd") %>% mutate(holiday = ifelse(is.na(holiday), 0, 1))
# Add day-of-week as a factor and a 0/1 weekend indicator derived from it.
trips_per_day <- trips_per_day %>% mutate(weekdays=as.factor(weekdays(ymd))) %>% mutate(isWeekend = ifelse(weekdays=="Saturday"|weekdays=="Sunday", 1, 0))
#trips_per_day <- left_join(trips_per_day, sun, by="ymd")
# Quick sanity check of the assembled data frame.
head(trips_per_day)
Let’s plot the number of trips taken as a function of the minimum temperature on each day.
# Scatter plot of daily trip counts against each day's minimum temperature.
ggplot(trips_per_day, aes(x = tmin, y = num_trips)) +
  geom_point() +
  scale_y_continuous() +
  labs(x = 'Minimum temperature', y = 'Daily trips')
#Cross-validation
Now we’ll try fitting different polynomials to this data, and use cross-validation to find the polynomial degree that generalizes best to held out data.
First we’ll shuffle the data and split it into 80% training, 10% validation, and 10% test sets.
# Reproducibility note: uncomment to match pre-R-3.6.0 sampling behavior.
#RNGkind(sample.kind = "Rounding")
set.seed(42)

num_days <- nrow(trips_per_day)
# 80% train / 10% validation / 10% test
frac_train <- 0.8
frac_val <- 0.1
frac_test <- 0.1
num_train <- floor(num_days * frac_train)
num_val <- floor(num_days * frac_val)
# give the test set any leftover days so the three splits cover every row
num_test <- num_days - num_train - num_val

# Shuffle the row indices ONCE and partition them, so the three splits are
# mutually exclusive. (The previous version sampled ndx, ndv, and ndt
# independently from 1:num_days, which let the same day appear in more than
# one split — leaking training rows into the validation and test sets.)
shuffled <- sample(seq_len(num_days))
ndx <- shuffled[seq_len(num_train)]
ndv <- shuffled[num_train + seq_len(num_val)]
ndt <- shuffled[num_train + num_val + seq_len(num_test)]

# used to fit the model
trips_per_day_train <- trips_per_day[ndx, ]
# used to evaluate the fit
trips_per_day_validate <- trips_per_day[ndv, ]
# used to test the fit
trips_per_day_test <- trips_per_day[ndt, ]
Now we’ll evaluate models from degree 1 up through degree 8. For each we’ll fit on the training data and evaluate on the validation data.
# Fit a model for each polynomial degree k = 1..8, recording RMSE on the
# training and validation splits.
K <- 1:8
# Preallocate the error vectors instead of growing them inside the loop.
train_err <- numeric(length(K))
validate_err <- numeric(length(K))
test_err <- c()
for (k in K) {
  # Fit on the training data.
  # NOTE(review): `weekdays` already distinguishes Saturday/Sunday, so adding
  # `isWeekend` makes the design matrix rank-deficient (weekdaysSunday comes
  # out NA in summary()) and triggers the "rank-deficient fit" warnings below.
  # Consider dropping one of the two — confirm intent before changing results.
  model <- lm(num_trips ~ poly(tmin, k, raw = TRUE) + holiday +
                snwd * isWeekend + prcp * isWeekend + snow * isWeekend +
                poly(tmax, k, raw = TRUE) + weekdays,
              data = trips_per_day_train)
  # RMSE on the training data
  train_err[k] <- sqrt(mean((predict(model, trips_per_day_train) - trips_per_day_train$num_trips)^2))
  # RMSE on the held-out validation data
  validate_err[k] <- sqrt(mean((predict(model, trips_per_day_validate) - trips_per_day_validate$num_trips)^2))
  # The test split is reserved for the final model — do not evaluate it here.
  #test_err[k] <- sqrt(mean((predict(model, trips_per_day_test) - trips_per_day_test$num_trips)^2))
}
## Warning in predict.lm(model, trips_per_day_train): prediction from a rank-
## deficient fit may be misleading
## Warning in predict.lm(model, trips_per_day_validate): prediction from a rank-
## deficient fit may be misleading
## Warning in predict.lm(model, trips_per_day_train): prediction from a rank-
## deficient fit may be misleading
## Warning in predict.lm(model, trips_per_day_validate): prediction from a rank-
## deficient fit may be misleading
## Warning in predict.lm(model, trips_per_day_train): prediction from a rank-
## deficient fit may be misleading
## Warning in predict.lm(model, trips_per_day_validate): prediction from a rank-
## deficient fit may be misleading
## Warning in predict.lm(model, trips_per_day_train): prediction from a rank-
## deficient fit may be misleading
## Warning in predict.lm(model, trips_per_day_validate): prediction from a rank-
## deficient fit may be misleading
## Warning in predict.lm(model, trips_per_day_train): prediction from a rank-
## deficient fit may be misleading
## Warning in predict.lm(model, trips_per_day_validate): prediction from a rank-
## deficient fit may be misleading
## Warning in predict.lm(model, trips_per_day_train): prediction from a rank-
## deficient fit may be misleading
## Warning in predict.lm(model, trips_per_day_validate): prediction from a rank-
## deficient fit may be misleading
## Warning in predict.lm(model, trips_per_day_train): prediction from a rank-
## deficient fit may be misleading
## Warning in predict.lm(model, trips_per_day_validate): prediction from a rank-
## deficient fit may be misleading
## Warning in predict.lm(model, trips_per_day_train): prediction from a rank-
## deficient fit may be misleading
## Warning in predict.lm(model, trips_per_day_validate): prediction from a rank-
## deficient fit may be misleading
# Inspect the last (degree-8) fit. Note the NA coefficient for weekdaysSunday
# in the output: `weekdays` and `isWeekend` are collinear, so one level of the
# factor cannot be estimated (the source of the rank-deficiency warnings).
summary(model)
##
## Call:
## lm(formula = num_trips ~ poly(tmin, k, raw = T) + holiday + snwd *
## isWeekend + prcp * isWeekend + snow * isWeekend + poly(tmax,
## k, raw = T) + weekdays, data = trips_per_day_train)
##
## Residuals:
## Min 1Q Median 3Q Max
## -11010.0 -1339.4 316.9 1826.0 12218.1
##
## Coefficients: (1 not defined because of singularities)
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -9.372e+05 4.139e+05 -2.264 0.0244 *
## poly(tmin, k, raw = T)1 2.068e+04 6.054e+04 0.342 0.7329
## poly(tmin, k, raw = T)2 -3.157e+04 8.512e+04 -0.371 0.7110
## poly(tmin, k, raw = T)3 2.440e+04 6.006e+04 0.406 0.6849
## poly(tmin, k, raw = T)4 -1.019e+04 2.383e+04 -0.428 0.6691
## poly(tmin, k, raw = T)5 2.440e+03 5.549e+03 0.440 0.6605
## poly(tmin, k, raw = T)6 -3.352e+02 7.520e+02 -0.446 0.6561
## poly(tmin, k, raw = T)7 2.460e+01 5.487e+01 0.448 0.6542
## poly(tmin, k, raw = T)8 -7.480e-01 1.666e+00 -0.449 0.6538
## holiday -1.012e+04 1.181e+03 -8.570 9.10e-16 ***
## snwd -4.198e+02 9.291e+01 -4.518 9.44e-06 ***
## isWeekend -6.570e+03 7.621e+02 -8.621 6.40e-16 ***
## prcp -9.174e+03 7.964e+02 -11.520 < 2e-16 ***
## snow 2.379e+02 2.873e+02 0.828 0.4085
## poly(tmax, k, raw = T)1 1.790e+06 7.880e+05 2.272 0.0239 *
## poly(tmax, k, raw = T)2 -1.423e+06 6.183e+05 -2.302 0.0221 *
## poly(tmax, k, raw = T)3 6.188e+05 2.656e+05 2.330 0.0206 *
## poly(tmax, k, raw = T)4 -1.611e+05 6.856e+04 -2.350 0.0195 *
## poly(tmax, k, raw = T)5 2.578e+04 1.092e+04 2.361 0.0190 *
## poly(tmax, k, raw = T)6 -2.483e+03 1.052e+03 -2.360 0.0190 *
## poly(tmax, k, raw = T)7 1.318e+02 5.611e+01 2.350 0.0195 *
## poly(tmax, k, raw = T)8 -2.966e+00 1.273e+00 -2.330 0.0206 *
## weekdaysMonday -1.076e+03 7.542e+02 -1.427 0.1547
## weekdaysSaturday 4.464e+02 7.390e+02 0.604 0.5464
## weekdaysSunday NA NA NA NA
## weekdaysThursday 6.596e+02 7.272e+02 0.907 0.3652
## weekdaysTuesday -8.679e+02 7.646e+02 -1.135 0.2574
## weekdaysWednesday 7.267e+02 7.339e+02 0.990 0.3230
## snwd:isWeekend 2.476e+02 1.480e+02 1.673 0.0956 .
## isWeekend:prcp 5.504e+02 1.455e+03 0.378 0.7055
## isWeekend:snow -9.409e+02 1.787e+03 -0.526 0.5991
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 3187 on 262 degrees of freedom
## Multiple R-squared: 0.9149, Adjusted R-squared: 0.9055
## F-statistic: 97.15 on 29 and 262 DF, p-value: < 2.2e-16
Now we’ll plot the training and validation error as a function of the polynomial degree.
# Reshape to long format — one row per (degree, split) with its RMSE —
# using pivot_longer(), which supersedes the retired gather().
plot_data <- data.frame(K, train_err, validate_err) %>%
  pivot_longer(-K, names_to = "split", values_to = "error")
# Interactive plot of training vs. validation RMSE by polynomial degree.
ggplotly(ggplot(plot_data, aes(x = K, y = error, color = split)) +
  geom_line() +
  scale_x_continuous(breaks = K) +
  xlab('Polynomial Degree') +
  ylab('RMSE'))
Although the training error decreases as we increase the degree, the validation error bottoms out at a moderate degree; we use a sixth-degree polynomial below.
Let’s re-fit this model with the chosen degree on the training data and plot the final result.
# Final model: sixth-degree polynomials in tmin and tmax plus the holiday,
# weather-by-weekend, and day-of-week terms.
# NOTE(review): this fits on the training split only; the surrounding prose
# originally said "all of the data" — confirm which is intended before
# quoting the metrics below.
model <- lm(num_trips ~ poly(tmin, 6, raw = TRUE) + holiday + snwd * isWeekend + prcp * isWeekend + snow * isWeekend + poly(tmax, 6, raw = TRUE) + weekdays, data = trips_per_day_train)
# RMSE and R^2 on each split (modelr::rmse / modelr::rsquare).
rmse(model,trips_per_day_train)
## Warning in predict.lm(model, data): prediction from a rank-deficient fit may be
## misleading
## [1] 3063.699
rsquare(model,trips_per_day_train)
## Warning in predict.lm(model, data): prediction from a rank-deficient fit may be
## misleading
## [1] 0.9123868
rmse(model,trips_per_day_validate)
## Warning in predict.lm(model, data): prediction from a rank-deficient fit may be
## misleading
## [1] 3131.872
rsquare(model,trips_per_day_validate)
## Warning in predict.lm(model, data): prediction from a rank-deficient fit may be
## misleading
## [1] 0.9033179
# Test-set numbers: quote these (not the validation numbers) as the
# generalization estimate, since validation guided the degree choice.
rmse(model,trips_per_day_test)
## Warning in predict.lm(model, data): prediction from a rank-deficient fit may be
## misleading
## [1] 3123.685
rsquare(model,trips_per_day_test)
## Warning in predict.lm(model, data): prediction from a rank-deficient fit may be
## misleading
## [1] 0.9166786
# Attach model predictions (pred column) to each split and label the split
# so the three can be combined and colored in one plot.
trips_per_day_train <- trips_per_day_train %>%
add_predictions(model) %>%
mutate(split = "train")
## Warning in predict.lm(model, data): prediction from a rank-deficient fit may be
## misleading
trips_per_day_validate <- trips_per_day_validate %>%
add_predictions(model) %>%
mutate(split = "validate")
## Warning in predict.lm(model, data): prediction from a rank-deficient fit may be
## misleading
trips_per_day_test <- trips_per_day_test %>%
add_predictions(model) %>%
mutate(split = "test")
## Warning in predict.lm(model, data): prediction from a rank-deficient fit may be
## misleading
# Stack all three splits into one frame for plotting.
plot_data <- bind_rows(trips_per_day_train, trips_per_day_validate, trips_per_day_test)
# Predicted vs. actual trips; the line y = pred is the perfect-prediction diagonal.
ggplotly(ggplot(plot_data, aes(x = pred, y = num_trips)) +
geom_point(aes(color = split)) +
geom_line(aes(y = pred)) +
xlab('Predicted number of daily trips') +
ylab('Actual number of daily trips') +
scale_y_continuous())
# Actual trips over the year, with the model's fitted curve overlaid.
ggplotly(ggplot(plot_data, aes(x = ymd, y = num_trips)) +
geom_point(aes(color = split)) +
geom_line(aes(y = pred)) +
xlab('Day of the year') +
ylab('Daily trips') +
scale_y_continuous())
We’re done at this point, with one important exception.
If we’d like to quote how well we expect this model to do on future data, we should use a final, held out test set that we touch only once to make this assessment. (Reusing the validation set would give an optimistic estimate, as our modeling process has already seen that data in the cross-validation process.)